import pandas as pd
import sys
sys.path.append('..')
from data.unlabeled import WORLD_CITIES as wct
# First look at the raw table and at how many values are missing per column.
wct.head()
wct.isna().sum()
# Row counts after and before de-duplication; equal numbers mean no duplicates.
len(wct.drop_duplicates()), len(wct)  # no duplicates
# Discard the columns that are not used in the rest of the analysis.
wct = wct.drop(columns=["city", "iso2", "iso3", "admin_name", "capital", "id"])
wct.columns
# The ASCII spelling becomes the canonical city name.
wct = wct.rename(columns={'city_ascii': 'city'})
wct.columns
# Rows without a population value, kept aside so they can be inspected later.
to_drop = wct[wct["population"].isna()]
to_drop
# Cleaned dataset: a new frame without any row containing a missing value
# (`wct` itself is left untouched).
wctc = wct.dropna()
wctc.isna().sum()  # cleaned dataset
We need to verify whether the rows dropped for missing values are randomly distributed or follow a hidden pattern. Plotting them on a map shows whether all geographic areas are still represented.
import geojson
import folium
# Names of the columns attached to every map feature as GeoJSON properties.
# NOTE(review): selected by position (4th and 5th column of `wctc`) — confirm
# this still picks the intended columns if the column layout changes.
data_to_plot = wctc.columns[3:5].tolist()
def df_to_geojson(df, properties, lat='latitude', lon='longitude'):
    """
    Turn a dataframe containing point data into a geojson formatted python dictionary.

    df : the dataframe to convert to geojson
    properties : a list of columns in the dataframe to turn into geojson feature properties
    lat : the name of the column in the dataframe that contains latitude data
    lon : the name of the column in the dataframe that contains longitude data

    Returns a dict {'type': 'FeatureCollection', 'features': [...]} holding one
    Point feature per dataframe row, with coordinates ordered [lon, lat].
    NumPy scalar values are converted to native Python values so that the
    result can be serialized with json.dumps.
    """
    import numpy as np  # local import keeps the function self-contained

    def _to_native(value):
        # iterrows() on an all-numeric frame yields numpy scalars; json.dumps
        # rejects e.g. numpy.int64, so unwrap them to plain Python numbers.
        return value.item() if isinstance(value, np.generic) else value

    # one geojson feature per dataframe row
    features = []
    for _, row in df.iterrows():
        features.append({
            'type': 'Feature',
            # each requested column becomes a feature property
            'properties': {prop: _to_native(row[prop]) for prop in properties},
            'geometry': {
                'type': 'Point',
                # longitude first, then latitude
                'coordinates': [_to_native(row[lon]), _to_native(row[lat])],
            },
        })
    # the result is named `features`/returned inline so the imported `geojson`
    # module is not shadowed inside the function
    return {'type': 'FeatureCollection', 'features': features}
# Put the rows without a population value on a world map to see where they are.
geo = df_to_geojson(df=to_drop, properties=data_to_plot, lat="lat", lon="lng")
m = folium.Map(location=[9, 9], zoom_start=2)
folium.GeoJson(geo).add_to(m)
# uncomment below to see the map
# m
to_drop.loc[to_drop["country"] == "Malta"]  # all null values
It looks like some countries have more missing information than others.
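A big city is defined as one with more than 500,000 residents in the reference year, 2020. Note that the code below does not apply this threshold directly: it selects the 1,000 most populous cities.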
# Rank the cleaned dataset by population and keep the first 1000 cities.
big_cities = wctc.sort_values(by="population", ascending=False).iloc[:1000]
big_cities.head()
# Export the selection (the row index is not written).
big_cities.to_csv('bigcities.csv', index=False)
# Put the selected cities on a world map.
geo2 = df_to_geojson(df=big_cities, properties=data_to_plot, lat="lat", lon="lng")
m2 = folium.Map(location=[9, 9], zoom_start=2)
folium.GeoJson(geo2).add_to(m2)
# uncomment below to see the map
# m2
We can see that some countries are not represented with this approach; one example is the African state of Namibia.
# Build a table holding (up to) the six most populous cities of every country
# that appears in the cleaned dataset.
countries = list(wctc["country"].unique())
# NOTE(review): the cities are taken from `wct` (before dropna), so a country
# with fewer than six complete rows can contribute rows with missing values —
# confirm this is intended rather than `wctc`.
per_country_top = [
    wct[wct["country"] == country].sort_values(by=["population"], ascending=False).head(6)
    for country in countries
]
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# pd.concat raises on an empty list, so fall back to an empty frame that has
# the same columns.
if per_country_top:
    top_cities = pd.concat(per_country_top, ignore_index=True)
else:
    top_cities = pd.DataFrame(columns=wctc.columns)
top_cities
# Export the selection (the row index is not written).
top_cities.to_csv('bigcities_allcountries.csv', index=False)
# Put the per-country selection on a world map.
geo3 = df_to_geojson(df=top_cities, properties=data_to_plot, lat="lat", lon="lng")
m3 = folium.Map(location=[9, 9], zoom_start=2)
folium.GeoJson(geo3).add_to(m3)
# evaluate the map object so that it is shown
m3
This way every country should be represented, probably including those for which many cities have been dropped.